suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Root directory of the analysis. The trailing slash matters: sub-paths are
# appended with paste0(), which inserts no separator.
wd <- '~/Google Drive/My Drive/Analysis/METTL2A/'
# Output directory for figures (used as `outdir` when saving plots).
figdir <- paste0(wd, 'Figures/DRS_diffthresh/')
# Output directory for tables (used as `outdir` when exporting TSV files).
tabledir <- paste0(wd, 'Tables/DRS_diffthresh/')
Functions
#' Build a path underneath the analysis root directory.
#'
#' @param path Character vector of paths relative to the global `wd`.
#' @return `path` with `wd` prepended; no separator is inserted, so `wd`
#'   must already end with a slash.
paste_wd <- function(path) {
  full_path <- paste0(wd, path)
  full_path
}
#' Keep sites whose median current intensity increased significantly.
#'
#' A row is kept only when, for both the `_G` and the `_I` column sets,
#' the KS intensity p-value is below `pval` and the cluster-2 median
#' intensity is higher than the cluster-1 median intensity.
#'
#' @param df Data frame containing `KS_intensity_pvalue_G`,
#'   `KS_intensity_pvalue_I`, `c1_median_intensity_G`,
#'   `c2_median_intensity_G`, `c1_median_intensity_I` and
#'   `c2_median_intensity_I`.
#' @param pval Exclusive upper bound applied to both KS intensity p-values.
#' @return The rows of `df` that satisfy all conditions. Rows for which a
#'   condition evaluates to `NA` are dropped, as usual for `filter()`.
filter_intensityup_sites_pval <- function(df, pval) {
  # Filter the table that was passed in. The previous version ignored `df`
  # and always read the global `sampcomp_results_joined`.
  df |>
    filter(
      KS_intensity_pvalue_G < pval &
        KS_intensity_pvalue_I < pval
    ) |>
    filter(
      c2_median_intensity_G - c1_median_intensity_G > 0 &
        c2_median_intensity_I - c1_median_intensity_I > 0
    )
}
#' Extract the reference k-mer plus flanking bases for every site.
#'
#' Transcript sequences are joined onto `df` by `transcript_id`, and the
#' window from `position + 1 - neighbor_length` to
#' `position + kmer_length + neighbor_length` (1-based, inclusive) is cut out
#' of `transcript_seq`. With `neighbor_length = 0` the window is exactly
#' `position + 1` to `position + kmer_length`.
#'
#' @param df Data frame whose columns `transcript_id` through `ref_kmer`
#'   (in column order) include `position`.
#' @param neighbor_length Number of flanking bases to add on each side.
#' @param kmer_length Length of the k-mer that starts at `position + 1`.
#'   The default of 5 reproduces the previously hard-coded window.
#' @param transcript_seqs Data frame with `transcript_id` and
#'   `transcript_seq` columns. Defaults to the global
#'   `espresso_AsPC1_transcriptome_seqs`, looked up lazily at call time.
#' @return The selected columns of `df`, the columns of `transcript_seqs`,
#'   and a `seq` column holding the extracted window. Windows that run past
#'   either end of the transcript are truncated.
get_neighbor_seq <- function(df, neighbor_length = 5, kmer_length = 5,
                             transcript_seqs = espresso_AsPC1_transcriptome_seqs) {
  df |>
    select(transcript_id:ref_kmer) |>
    # Name the key explicitly: no "Joining with ..." message, and the join
    # cannot silently change if the tables ever share another column name.
    left_join(transcript_seqs, by = 'transcript_id') |>
    mutate(
      seq = str_sub(
        transcript_seq,
        # str_sub() counts negative starts from the END of the string, so a
        # window reaching before the first base must be clamped to 1.
        pmax(position + 1 - neighbor_length, 1),
        position + kmer_length + neighbor_length
      )
    )
}
Read data
# Load the joined sampcomp results table (gzipped TSV) from under `wd`.
sampcomp_results_joined <- read_tsv(
  paste_wd('Tables/DRS/Positions/sampcomp_results_joined_2024-04-09.tsv.gz')
)
## Rows: 5884004 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (34): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (33): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the loaded table (top-level evaluation auto-prints).
sampcomp_results_joined
## # A tibble: 5,884,004 × 67
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000264926.7 RAD18-201 1464 TCACA NA
## 2 ENST00000264926.7 RAD18-201 1465 CACAT 1
## 3 ENST00000264926.7 RAD18-201 1466 ACATA NA
## 4 ENST00000264926.7 RAD18-201 1467 CATAA 1
## 5 ENST00000264926.7 RAD18-201 1468 ATAAA NA
## 6 ENST00000264926.7 RAD18-201 1473 AACGA 1
## 7 ENST00000264926.7 RAD18-201 1475 CGATC NA
## 8 ENST00000264926.7 RAD18-201 1486 ACACA NA
## 9 ENST00000264926.7 RAD18-201 1501 CAAGA 1
## 10 ENST00000264926.7 RAD18-201 1502 AAGAC NA
## # ℹ 5,883,994 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Load the per-transcript sequence table (gzipped TSV) from under `wd`.
espresso_AsPC1_transcriptome_seqs <- read_tsv(
  paste_wd('Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz')
)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the sequence table (top-level evaluation auto-prints).
espresso_AsPC1_transcriptome_seqs
## # A tibble: 36,717 × 3
## transcript_id transcript_seq transcript_length
## <chr> <chr> <dbl>
## 1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA… 987
## 2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA… 2252
## 3 ENST00000420393.5 CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGG… 854
## 4 ENST00000698415.1 GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTC… 6597
## 5 ENST00000698416.1 CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTC… 5500
## 6 ENST00000488263.5 AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTT… 4528
## 7 ENST00000424814.5 GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACC… 2038
## 8 ENST00000231948.9 AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCA… 2187
## 9 ENST00000432408.6 GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAG… 2203
## 10 ENST00000459840.5 ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTG… 723
## # ℹ 36,707 more rows
Check
# Sanity check of the coordinate arithmetic in get_neighbor_seq(): with
# neighbor_length = 0 the extracted window is only the five bases from
# position + 1 to position + 5, so the rows kept by the final filter are
# those where that window equals ref_kmer.
sampcomp_results_joined |>
filter_intensityup_sites_pval(pval = .05) |>
get_neighbor_seq(neighbor_length = 0) |>
filter(ref_kmer == seq)
## Joining with `by = join_by(transcript_id)`
## # A tibble: 605 × 7
## transcript_id transcript_name position ref_kmer transcript_seq
## <chr> <chr> <dbl> <chr> <chr>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA AGCCCTTGCGCGCCACCGTCCCTT…
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC CTTCTCTTACCGCCATCTTGGCTC…
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT CTTCTCTTACCGCCATCTTGGCTC…
## 4 ENST00000389680.2 MT-RNR1-201 43 ACACA AATAGGTTTGGTCCTAGCCTTTCT…
## 5 ENST00000389680.2 MT-RNR1-201 57 CCCCG AATAGGTTTGGTCCTAGCCTTTCT…
## 6 ENST00000389680.2 MT-RNR1-201 71 GTTCA AATAGGTTTGGTCCTAGCCTTTCT…
## 7 ENST00000389680.2 MT-RNR1-201 73 TCACC AATAGGTTTGGTCCTAGCCTTTCT…
## 8 ENST00000389680.2 MT-RNR1-201 75 ACCCT AATAGGTTTGGTCCTAGCCTTTCT…
## 9 ENST00000389680.2 MT-RNR1-201 93 ATCAA AATAGGTTTGGTCCTAGCCTTTCT…
## 10 ENST00000389680.2 MT-RNR1-201 138 GCTTA AATAGGTTTGGTCCTAGCCTTTCT…
## # ℹ 595 more rows
## # ℹ 2 more variables: transcript_length <dbl>, seq <chr>
# Destination folder for the exported neighbour-sequence TSV and FASTA files.
fasta_dir <- paste_wd('Fasta/DRS_diffthresh/Neighbor_5/')
# P-value cut-offs at which the analyses below are repeated.
pval_threshs <- c(0.001, 0.01, 0.05, 0.1)
#' Export flanking sequences of intensity-increased sites for one cut-off.
#'
#' Filters the global `sampcomp_results_joined` with
#' `filter_intensityup_sites_pval()`, extracts each site's window with five
#' flanking bases per side, and writes the result first as a TSV and then as
#' a FASTA file into the global `fasta_dir`.
#'
#' @param pval P-value cut-off; it is also appended to the output base name.
#' @return Whatever `export_as_fasta()` returns. The function is called for
#'   its side effects (the two exported files).
export_neighbor_seq_increased_currentintensity_sites_pval <- function(pval) {
  fasta_basename <- paste0('increased_currentintensity_sites_', pval)
  sampcomp_results_joined |>
    filter_intensityup_sites_pval(pval = pval) |>
    get_neighbor_seq(neighbor_length = 5) |>
    select(transcript_id:position, seq) |>
    # `position` is stored as a double. Pasting it directly renders round
    # values such as 100000 as "1e+05", so coerce to integer to keep the
    # record name in plain digits.
    mutate(name = paste0(transcript_id, '|', as.integer(position))) |>
    # NOTE(review): relies on export_tsv() passing the table through to the
    # next pipe step - confirm against myUtilities.
    export_tsv(basename = fasta_basename, outdir = fasta_dir) |>
    export_as_fasta(
      name = name, sequence = seq, fasta_basename = fasta_basename,
      outdir = fasta_dir, compression = ''
    )
}
# Run the export once per cut-off; walk() is used for its side effects only.
walk(pval_threshs, export_neighbor_seq_increased_currentintensity_sites_pval)
## Joining with `by = join_by(transcript_id)`
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.001_2025-07-19.tsv
##
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.001.fa
## Joining with `by = join_by(transcript_id)`
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.01_2025-07-19.tsv
##
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.01.fa
## Joining with `by = join_by(transcript_id)`
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.05_2025-07-19.tsv
##
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.05.fa
## Joining with `by = join_by(transcript_id)`
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.1_2025-07-19.tsv
##
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.1.fa
Check CC motifs
#' Tabulate intensity-increased sites by `middleC_info` for one cut-off.
#'
#' @param pval P-value cut-off handed to `filter_intensityup_sites_pval()`.
#' @return A tibble with columns `pval` (the label "pval < <cut-off>"),
#'   `middleC_info`, `n` (number of sites) and `percentage` (share of all
#'   sites passing the cut-off, in percent).
calc_percentage_groupedby_middle_consecutiveC <- function(pval) {
  threshold_label <- paste0('pval < ', pval)

  # Count the passing sites within each middleC_info category.
  category_counts <- sampcomp_results_joined |>
    filter_intensityup_sites_pval(pval = pval) |>
    group_by(middleC_info) |>
    reframe(n = n())

  # Convert counts to percentages and put the cut-off label first.
  category_counts |>
    mutate(
      percentage = 100 * n / sum(n),
      pval = threshold_label
    ) |>
    select(pval, everything())
}
# One percentage table per cut-off, stacked into a single tibble.
percentages_middle_consecutiveCs <- bind_rows(
  map(pval_threshs, calc_percentage_groupedby_middle_consecutiveC)
)
# Write the combined table into the table output directory.
export_tsv(percentages_middle_consecutiveCs, outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_diffthresh/percentages_middle_consecutiveCs_2025-07-19.tsv
## # A tibble: 16 × 4
## pval middleC_info n percentage
## <chr> <chr> <int> <dbl>
## 1 pval < 0.001 C1 17 8.25
## 2 pval < 0.001 C2 67 32.5
## 3 pval < 0.001 C3 75 36.4
## 4 pval < 0.001 others 47 22.8
## 5 pval < 0.01 C1 28 7.55
## 6 pval < 0.01 C2 138 37.2
## 7 pval < 0.01 C3 129 34.8
## 8 pval < 0.01 others 76 20.5
## 9 pval < 0.05 C1 46 7.60
## 10 pval < 0.05 C2 247 40.8
## 11 pval < 0.05 C3 196 32.4
## 12 pval < 0.05 others 116 19.2
## 13 pval < 0.1 C1 59 7.68
## 14 pval < 0.1 C2 309 40.2
## 15 pval < 0.1 C3 237 30.9
## 16 pval < 0.1 others 163 21.2
# Horizontal stacked bar chart: one bar per p-value cut-off, split by
# middleC_info category.
percentages_middle_consecutiveCs_barplot <-
  percentages_middle_consecutiveCs |>
  # Fix the category order; fct_rev() below reverses it for the fill scale.
  mutate(
    middleC_info = factor(middleC_info, levels = c('C3', 'C2', 'C1', 'others'))
  ) |>
  ggplot(aes(
    x = pval |> as.character(), y = percentage,
    fill = middleC_info |> fct_rev()
  )) +
  # `linewidth` replaces `size` for the bar outline; `size` for lines has
  # been deprecated since ggplot2 3.4.0 and triggers a warning.
  geom_bar(stat = 'identity', colour = 'gray20', linewidth = 0.4) +
  scale_y_continuous(breaks = seq(0, 100, 20)) +
  # Colours are matched in order to the reversed levels:
  # others, C1, C2, C3.
  scale_fill_manual(
    values = c('#BEBEBE', '#c5c5fb', '#7777F5', '#3131c1')
  ) +
  coord_flip()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Save the bar plot via ggsave_pdf() into the figure output directory.
ggsave_pdf(
  percentages_middle_consecutiveCs_barplot,
  width = 8, height = 4, outdir = figdir
)
